# coding=utf-8
import re
import requests

# HTTP request headers passed to requests.get() by parse_url; the only entry
# is a desktop Chrome User-Agent string.
HREADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/93.0.4577.82 Safari/537.36"
    ),
}


def _extract_poems(text):
    """Extract poem records from a page's HTML using regular expressions.

    Args:
        text: The HTML source of the page as a string.

    Returns:
        A list of dicts with the keys "title", "author", "dynasty" and
        "content", one per matched entry.
    """
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    authors = re.findall(r'<p\sclass="source".*?<a.*?>(.*?)</a>', text, re.DOTALL)
    dynasties = re.findall(
        r'<p\sclass="source".*?<a.*?>.*?</a><a.*?>(.*?)</a>', text, re.DOTALL
    )
    raw_contents = re.findall(
        r'<div\sclass="contson".*?>(.*?)</div', text, re.DOTALL
    )
    # The captured body still contains inline markup; drop every tag and
    # trim the surrounding whitespace.
    contents = [re.sub(r'<.*?>', "", raw).strip() for raw in raw_contents]

    # NOTE: the four lists are matched independently and paired by position;
    # zip() stops at the shortest one, so entries beyond it are dropped.
    return [
        {
            "title": title,
            "author": author,
            "dynasty": dynasty,
            "content": content,
        }
        for title, author, dynasty, content in zip(
            titles, authors, dynasties, contents
        )
    ]


def parse_url(url, timeout=10):
    """Download a page and print/return the poem records found in it.

    Args:
        url: Address of the page to fetch with an HTTP GET request.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        The list of dicts ("title", "author", "dynasty", "content") that was
        also printed to stdout.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=HREADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into [].
    response.raise_for_status()

    all_data = _extract_poems(response.text)

    print(all_data)
    return all_data


if __name__ == "__main__":
    # Pages to scrape when the file is run as a script.
    urls = ["https://www.gushiwen.cn/default_2.aspx"]
    for url in urls:
        parse_url(url)
